R coding to calculate standardised rates and ratios from cancer registry data

Author

Anshu Uppal

Published

April 28, 2025

Code
# install.packages("pacman")
pacman::p_load(
        here,
        tidyverse,
        # Rcan, # package for Cancer Registry Data Analysis and Visualisation
        PHEindicatormethods, # package for standardising rates
        # sf, # for spatial functions
        DT # package for table formatting and styling
)

# Read in the pre-processed data -------------------------------------

# source(here("code", "01_prep.R"))
# Most recent data, from CI5-XII
swiss_XII <- readRDS(here("data", "generated_data", "swiss_XII.rds"))
# National data, from CI5-XII
swiss_ref_XII <- readRDS(here("data", "generated_data", "swiss_ref_XII.rds"))
# Annual data, from CI5-I to CI5-XII
swiss_plus <- readRDS(here("data", "generated_data", "swiss_plus.rds"))
# # Canton polygons data
# canton_boundaries <- readRDS(here("data", "generated_data", "canton_boundaries.rds"))
Code
/* set DT table fontsizes */
th { font-size: 11px; } /* header font */
td { font-size: 11px; } /* cell font */

CI5 XII (2013-2017)

This section uses data from Volume XII of the IARC’s “Cancer Incidence in Five Continents (CI5)” collaboration: https://ci5.iarc.fr/ci5-xii/download. The full dataset was then restricted to data from Switzerland-based cancer registries only.

World and European Standard Populations were downloaded from: https://www.opendata.nhs.scot/dataset/standard-populations

The data pre-processing code can be found in the 01_prep.R file on the GitHub repository for this page.

Direct standardisation for age-standardised rates

Note

When using the calculate_dsr() function from the PHEindicatormethods package, when the cases within the strata are less than 10 the DSR is suppressed (“When the total count is < 10 DSRs are not reliable and will therefore be suppressed in the output”).

Code
# "When the total count is < 10 DSRs are not reliable and will therefore be suppressed in the output." (from the `PHEindicatormethods::calculate_dsr()` funtion)

## European Standard Population
dsr_swiss_ESP <- swiss_XII |> 
        # filter(
        #         cancer_label %in% c("Colon")
        #         , id_region %in% c("Geneva", "Vaud")
        # ) |> 
        # Add grouping variables of interest
        group_by(period, cancer_label, id_region, sex) |> 
        PHEindicatormethods::calculate_dsr(
                x = cases, # field containing count of events
                n = py, # field containing population denominators
                stdpop = EuropeanStandardPopulation, # field containing standard populations
                type = "standard"
        ) |> 
        mutate(crude_rate = total_count/total_pop*100000) |> 
        relocate(crude_rate, .before = value) |> 
        rename(cases = total_count, ASRate = value, lower_CI = lowercl, upper_CI = uppercl) |> 
        mutate(across(c(crude_rate:upper_CI), ~ round(.x, 2)))
Code
dsr_swiss_ESP |> 
        filter(cancer_label %in% c("All sites", "All sites but skin", "Other skin")) |> 
        ggplot(aes(x = ASRate, y = id_region, color = sex, fill = sex))+
        geom_col(position = position_dodge(width = 0.4))+
        theme_bw()+
        labs(x = "Age-standardised incidence rate, per 100,000", 
             y = NULL,
             color = NULL, fill = NULL)+
        geom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI),
                       height = 0.4, 
                       linewidth = 0.5,
                       position = position_dodge(width = 0.4))+
        facet_wrap(.~cancer_label, 
                   labeller = label_wrap_gen(),
                   ncol = 5)+
        theme(legend.position = "top")

Code
dsr_swiss_ESP |> 
        filter(!cancer_label %in% c("All sites", "All sites but skin", "Other skin")) |> 
        # filter(cancer_label %in% c("Breast", "Colon", "Adrenal gland")) |>
        ggplot(aes(x = ASRate, y = id_region, color = sex, fill = sex))+
        geom_col(position = position_dodge(width = 0.4))+
        theme_bw()+
        labs(x = "Age-standardised incidence rate, per 100,000", 
             y = NULL,
             color = NULL, fill = NULL)+
        geom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI),
                       height = 0.4, 
                       linewidth = 0.5,
                       position = position_dodge(width = 0.4))+
        facet_wrap(.~cancer_label, 
                   labeller = label_wrap_gen(),
                   ncol = 5)+
        theme(legend.position = "top")

Code
dsr_swiss_ESP |> 
        DT::datatable(
                filter = "top",
                options = list(
                        pageLength = 26 
                ),
                rownames = FALSE, # set to FALSE for cleaner look
                class = 'cell-border stripe hover nowrap compact'
        )
Code
# "When the total count is < 10 DSRs are not reliable and will therefore be suppressed in the output." (from the `PHEindicatormethods::calculate_dsr()` funtion)

## European Standard Population
dsr_swiss_WSP <- swiss_XII |> 
        # filter(
        #         cancer_label %in% c("Colon")
        #         , id_region %in% c("Geneva", "Vaud")
        # ) |> 
        # Add grouping variables of interest
        group_by(period, cancer_label, id_region, sex) |> 
        PHEindicatormethods::calculate_dsr(
                x = cases, # field containing count of events
                n = py, # field containing population denominators
                stdpop = WorldStandardPopulation, # field containing standard populations
                type = "standard"
        ) |> 
        mutate(crude_rate = total_count/total_pop*100000) |> 
        relocate(crude_rate, .before = value) |> 
        rename(cases = total_count, ASRate = value, lower_CI = lowercl, upper_CI = uppercl) |> 
        mutate(across(c(crude_rate:upper_CI), ~ round(.x, 2)))
Code
dsr_swiss_WSP |> 
        filter(cancer_label %in% c("All sites", "All sites but skin", "Other skin")) |> 
        ggplot(aes(x = ASRate, y = id_region, color = sex, fill = sex))+
        geom_col(position = position_dodge(width = 0.4))+
        theme_bw()+
        labs(x = "Age-standardised incidence rate, per 100,000", 
             y = NULL,
             color = NULL, fill = NULL)+
        geom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI),
                       height = 0.4, 
                       linewidth = 0.5,
                       position = position_dodge(width = 0.4))+
        facet_wrap(.~cancer_label, 
                   labeller = label_wrap_gen(),
                   ncol = 5)+
        theme(legend.position = "top")

Code
dsr_swiss_WSP |> 
        filter(!cancer_label %in% c("All sites", "All sites but skin", "Other skin")) |> 
        # filter(cancer_label %in% c("Breast", "Colon", "Adrenal gland")) |>
        ggplot(aes(x = ASRate, y = id_region, color = sex, fill = sex))+
        geom_col(position = position_dodge(width = 0.4))+
        theme_bw()+
        labs(x = "Age-standardised incidence rate, per 100,000", 
             y = NULL,
             color = NULL, fill = NULL)+
        geom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI),
                       height = 0.4, 
                       linewidth = 0.5,
                       position = position_dodge(width = 0.4))+
        facet_wrap(.~cancer_label, 
                   labeller = label_wrap_gen(),
                   ncol = 5)+
        theme(legend.position = "top")

Code
dsr_swiss_WSP |> 
        DT::datatable(
                filter = "top",
                options = list(
                        pageLength = 26 
                ),
                rownames = FALSE, # set to FALSE for cleaner look
                class = 'cell-border stripe hover nowrap compact'
        )

By age (crude rates only)

Code
age_crude <- swiss_XII |> 
        group_by(cancer_label, id_region, age_ESP, sex) |> 
        summarise(crude_rate = cases/py*100000, across()) |> 
        select(cancer_label:crude_rate, cases, py, period) |> 
        mutate(crude_rate = round(crude_rate, 2))
Code
age_crude |> 
        # Remove the text " years" while keeping the factor structure
        mutate(age_ESP = fct_relabel(age_ESP, ~ str_remove(.x, " years"))) |> 
        filter(id_region %in% c("Geneva")) |> 
        # filter(cancer_label %in% c("All sites", "Colon", "Breast")) |> 
        ggplot(aes(x=age_ESP, y = crude_rate, color = sex, group = sex))+
        geom_path()+
        facet_wrap(.~cancer_label, scales = "free_y",
                   labeller = label_wrap_gen(),
                   ncol = 6
                   )+
        labs(y = "Crude incidence rate per 100,000", x = "Age group", color = NULL)+
        theme_bw()+
        # Show only every third label on x-axis
        scale_x_discrete(breaks = \(x)x[seq(3, length(x), by = 3)])+
        theme(axis.text.x = element_text(angle = 50, vjust = 0.8),
              legend.position = "top")

Code
# Format it as a table
age_crude |>     
        filter(sex == "Male") |> 
DT::datatable(
        filter = "top",
        options = list(
                pageLength = 18 
        ),
        rownames = FALSE, # set to FALSE for cleaner look
        class = 'cell-border stripe hover nowrap compact'
)
Code
age_crude |>     
        filter(sex == "Female") |> 
        DT::datatable(
                filter = "top",
                options = list(
                        pageLength = 18 
                ),
                rownames = FALSE, # set to FALSE for cleaner look
                class = 'cell-border stripe hover nowrap compact'
        )

Indirect standardisation

During the pre-processing, I aggregated the data to the level of the whole of Switzerland, per cancer type and summing the cases and person-years by age and sex. These national reference cases and person-years are then used below to calculate indirectly standardised rates and ratios for each canton for each cancer type, by sex.

Indirectly standardised rates

Code
ind.rate <-
        swiss_XII |> 
        group_by(cancer_label, sex, id_region) |> 
        PHEindicatormethods::calculate_ISRate(
                x = cases,
                n = py,
                x_ref = cases_ref, # national cases as reference
                n_ref = py_ref, # national py as reference
                refpoptype = "field"
        ) |> 
        rename(ISRate = value, lower_CI = lowercl, upper_CI = uppercl) |> 
        mutate(across(c(expected:upper_CI), ~ round(.x, 2)))

# Generate the table
ind.rate |> 
        select(-c(confidence:method)) |> 
        DT::datatable(
                caption = "Indirectly standardised rate per 100,000, with 95% CI",
                filter = "top",
                options = list(
                        pageLength = 14 
                ),
                rownames = FALSE, # set to FALSE for cleaner look
                class = 'cell-border stripe hover nowrap compact'
        )

Indirectly standardised ratios

Code
ind.ratio <-
        swiss_XII |> 
        group_by(cancer_label, sex, id_region) |> 
        PHEindicatormethods::calculate_ISRatio(
                x = cases,
                n = py,
                x_ref = cases_ref, # national cases as reference
                n_ref = py_ref, # national py as reference
                refpoptype = "field"
        ) |> 
        rename(ISRatio = value, lower_CI = lowercl, upper_CI = uppercl) |> 
        mutate(across(c(expected:upper_CI), ~ round(.x, 2)))

(for a select few cancer types)

Code
ind.ratio |> 
        filter(cancer_label %in%
                       c("All sites", "Colon", "Rectum",
                         "Bladder", "Prostate", "Breast",
                         "Bone", "Vulva", "Lung (incl. trachea and bronchus)"
                       )) |>
        mutate(est_ci_label = sprintf("%.2f [%.2f-%.2f]", 
                                      ISRatio, lower_CI, upper_CI)
        ) |> 
        # arrange(id_region, sex) |> 
        # Start the plot
        ggplot(aes(x = ISRatio, y = id_region, color = sex))+
        # Add CI
        geom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI), 
                       height = 0.4, 
                       linewidth = 0.5,
                       position = position_dodge(width = 0.5)
        )+
        # Add point estimate
        geom_point(size = 1.5, 
                   position = position_dodge(width = 0.5))+
        # Add vertical reference line at ISRatio = 1
        geom_vline(xintercept = 1, linetype = "dashed", color = "grey50")+
        # Separate plots by cancer type
        facet_wrap(~ cancer_label, 
                   labeller = label_wrap_gen(),
                   ncol = 3
                   # scales = "free_x" # Use free_y so regions aren't duplicated across facets
        )+
        # Add labels and title
        labs(
                title = "Indirectly Standardized Ratio by Canton and Sex", 
                subtitle = "(reference: national data for Switzerland)",
                x = "IS Ratio (95% CI)",
                y = NULL,
                color = NULL # Legend title
        ) +
        # Use a clean theme
        theme_bw(base_size = 12) +
        theme(
                panel.grid.major.x = element_blank(),
                panel.grid.minor.x = element_blank(),
                # strip.text = element_text(face = "bold"), # Make facet titles bold
                axis.text.y = element_text(size = 12),    # Adjust y-axis text size if needed
                # panel.spacing.y = unit(1, "lines"),      # Add space between facets
                legend.position = "top"                  # Position legend
        )

Code
ind.ratio |> 
        filter(id_region %in%c("Geneva")) |>
        mutate(est_ci_label = sprintf("%.2f [%.2f-%.2f]", 
                                      ISRatio, lower_CI, upper_CI)
        ) |> 
        # arrange(id_region, sex) |> 
        # Start the plot
        ggplot(aes(x = ISRatio, y = cancer_label, color = sex))+
        # Add CI
        geom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI), 
                       height = 0.4, 
                       linewidth = 0.5,
                       position = position_dodge(width = 0.5)
        )+
        # Add point estimate
        geom_point(size = 1.5, 
                   position = position_dodge(width = 0.5))+
        # Add vertical reference line at ISRatio = 1
        geom_vline(xintercept = 1, linetype = "dashed", color = "grey50")+
        # Separate plots by cancer type
        facet_wrap(~ id_region, 
                   labeller = label_wrap_gen(),
                   ncol = 3
                   # scales = "free_x" # Use free_y so regions aren't duplicated across facets
        )+
        # Add labels and title
        labs(
                title = "Indirectly Standardized Ratio by Canton and Sex", 
                subtitle = "(reference: national data for Switzerland)",
                x = "IS Ratio (95% CI)",
                y = NULL,
                color = NULL # Legend title
        ) +
        # Use a clean theme
        theme_bw(base_size = 12) +
        theme(
                panel.grid.major.x = element_blank(),
                panel.grid.minor.x = element_blank(),
                # strip.text = element_text(face = "bold"), # Make facet titles bold
                axis.text.y = element_text(size = 12),    # Adjust y-axis text size if needed
                # panel.spacing.y = unit(1, "lines"),      # Add space between facets
                legend.position = "top"                  # Position legend
        )

Code
# Generate the table
ind.ratio |> 
        select(-c(confidence:method)) |> 
        DT::datatable(
                caption = "indirectly standardised ratio x 1, with 95% CI",
                filter = "top",
                options = list(
                        pageLength = 13 
                ),
                rownames = FALSE, # set to FALSE for cleaner look
                class = 'cell-border stripe hover nowrap compact'
        )